{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 二、编辑距离（Minimum Edit Distance，MED）\n",
    "- Levenshtein Distance 是用来度量两个序列相似程度的指标\n",
    "- 将单词 w1 转换为单词 w2 所需要的最少单字符编辑操作次数。单字符编辑操作如下三种：\n",
    "    - 插入\n",
    "    - 删除\n",
    "    - 替换\n",
    "- 如 \"kitten\" 和 \"sitting\"，编辑距离为 3：\n",
    "    - kitten → sitten（替换）\n",
    "    - sitten → sittin（替换）\n",
    "    - sittin → sitting（插入）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 求解编辑距离\n",
    "def edit_distance(str1, str2):\n",
    "    n = len(str1)\n",
    "    m = len(str2)\n",
    "\n",
    "    dp = [[0] * (m + 1) for _ in range(n + 1)]\n",
    "\n",
    "    for j in range(1, m + 1):\n",
    "        dp[0][j] = dp[0][j - 1] + 1\n",
    "    for i in range(1, n + 1):\n",
    "        dp[i][0] = dp[i - 1][0] + 1\n",
    "\n",
    "    for i in range(1, n + 1):\n",
    "        for j in range(1, m + 1):\n",
    "            if str1[i - 1] == str2[j - 1]:\n",
    "                dp[i][j] = dp[i - 1][j - 1]\n",
    "            else:\n",
    "                dp[i][j] = min(dp[i - 1][j], dp[i][j - 1],\n",
    "                               dp[i - 1][j - 1]) + 1\n",
    "    return dp[-1][-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-30T07:14:42.845797Z",
     "start_time": "2020-03-30T07:14:42.836859Z"
    }
   },
   "outputs": [],
   "source": [
    "# 求解编辑距离的同时，获取编辑路径\n",
    "def edit_distance(str1, str2):\n",
    "    matrix = [[i + j for j in range(len(str2) + 1)]\n",
    "              for i in range(len(str1) + 1)]\n",
    "\n",
    "    operation_matrix = [['' for j in range(len(str2) + 1)]\n",
    "                        for i in range(len(str1) + 1)]\n",
    "    for j in range(1, len(str2) + 1):\n",
    "        operation_matrix[0][j] = operation_matrix[0][j - 1] + 'ADD {};'.format(\n",
    "            str2[j - 1])\n",
    "    for i in range(1, len(str1) + 1):\n",
    "        operation_matrix[i][0] = operation_matrix[i - 1][0] + 'DEL {};'.format(\n",
    "            str1[i - 1])\n",
    "\n",
    "    for i in range(1, len(str1) + 1):\n",
    "        for j in range(1, len(str2) + 1):\n",
    "            if (str1[i - 1] == str2[j - 1]):\n",
    "                d = 0\n",
    "                operation = ''\n",
    "            else:\n",
    "                d = 1\n",
    "                operation = 'SUB {}=>{};'.format(str1[i - 1], str2[j - 1])\n",
    "\n",
    "            matrix[i][j], operation_matrix[i][j] = min(\n",
    "                (matrix[i - 1][j - 1] + d,\n",
    "                 operation_matrix[i - 1][j - 1] + operation),\n",
    "                (matrix[i - 1][j] + 1,\n",
    "                 operation_matrix[i - 1][j] + 'DEL {};'.format(str1[i - 1])),\n",
    "                (matrix[i][j - 1] + 1,\n",
    "                 operation_matrix[i][j - 1] + 'ADD {};'.format(str2[j - 1])))\n",
    "\n",
    "    return matrix[-1][-1], operation_matrix[-1][-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-30T07:14:46.531722Z",
     "start_time": "2020-03-30T07:14:46.523616Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(6, 'ADD b;ADD e;ADD i;DEL e;SUB b=>n;SUB a=>g;')"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "edit_distance('jieba', 'beijing')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-30T07:19:37.149007Z",
     "start_time": "2020-03-30T07:19:37.135836Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2,\n",
       " {('A', 'A'): '',\n",
       "  ('A', 'AB'): 'ADD B',\n",
       "  ('A', 'ABC'): 'ADD C',\n",
       "  ('A', 'ABCC'): 'ADD C',\n",
       "  ('A', 'ABCCE'): 'ADD E',\n",
       "  ('A', 'ABCCEF'): 'ADD F',\n",
       "  ('AB', 'A'): 'DEL B',\n",
       "  ('AB', 'AB'): '',\n",
       "  ('AB', 'ABC'): 'ADD C',\n",
       "  ('AB', 'ABCC'): 'ADD C',\n",
       "  ('AB', 'ABCCE'): 'ADD E',\n",
       "  ('AB', 'ABCCEF'): 'ADD F',\n",
       "  ('ABC', 'A'): 'DEL C',\n",
       "  ('ABC', 'AB'): 'DEL C',\n",
       "  ('ABC', 'ABC'): '',\n",
       "  ('ABC', 'ABCC'): 'ADD C',\n",
       "  ('ABC', 'ABCCE'): 'ADD E',\n",
       "  ('ABC', 'ABCCEF'): 'ADD F',\n",
       "  ('ABCD', 'A'): 'DEL D',\n",
       "  ('ABCD', 'AB'): 'DEL D',\n",
       "  ('ABCD', 'ABC'): 'DEL D',\n",
       "  ('ABCD', 'ABCC'): 'SUB D => C',\n",
       "  ('ABCD', 'ABCCE'): 'ADD E',\n",
       "  ('ABCD', 'ABCCEF'): 'ADD F',\n",
       "  ('ABCDE', 'A'): 'DEL E',\n",
       "  ('ABCDE', 'AB'): 'DEL E',\n",
       "  ('ABCDE', 'ABC'): 'DEL E',\n",
       "  ('ABCDE', 'ABCC'): 'DEL E',\n",
       "  ('ABCDE', 'ABCCE'): '',\n",
       "  ('ABCDE', 'ABCCEF'): 'ADD F'})"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 记忆化求解\n",
    "from functools import lru_cache\n",
    "solution = {}\n",
    "\n",
    "\n",
    "@lru_cache(maxsize=2**10)\n",
    "def edit_distance(string1, string2):\n",
    "    if min(len(string1), len(string2)) == 0:\n",
    "        return max(len(string1), len(string2))\n",
    "\n",
    "    tail_s1 = string1[-1]\n",
    "    tail_s2 = string2[-1]\n",
    "\n",
    "    candidates = [\n",
    "        (edit_distance(string1[:-1], string2) + 1, 'DEL {}'.format(tail_s1)),\n",
    "        (edit_distance(string1, string2[:-1]) + 1, 'ADD {}'.format(tail_s2))\n",
    "    ]\n",
    "\n",
    "    if tail_s1 == tail_s2:\n",
    "        both_forward = (edit_distance(string1[:-1], string2[:-1]) + 0, '')\n",
    "    else:\n",
    "        both_forward = (edit_distance(string1[:-1], string2[:-1]) + 1,\n",
    "                        'SUB {} => {}'.format(tail_s1, tail_s2))\n",
    "\n",
    "    candidates.append(both_forward)\n",
    "\n",
    "    min_distance, operation = min(candidates, key=lambda x: x[0])\n",
    "    solution[(string1, string2)] = operation\n",
    "\n",
    "    return min_distance\n",
    "\n",
    "\n",
    "edit_distance('ABCDE', 'ABCCEF'), solution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 自顶向下，递归\n",
    "def Levenshtein_Distance_Recursive(str1, str2):\n",
    "\n",
    "    if len(str1) == 0:\n",
    "        return len(str2)\n",
    "    elif len(str2) == 0:\n",
    "        return len(str1)\n",
    "    elif str1 == str2:\n",
    "        return 0\n",
    "\n",
    "    if str1[len(str1) - 1] == str2[len(str2) - 1]:\n",
    "        d = 0\n",
    "    else:\n",
    "        d = 1\n",
    "\n",
    "    return min(\n",
    "        Levenshtein_Distance_Recursive(str1, str2[:-1]) + 1,\n",
    "        Levenshtein_Distance_Recursive(str1[:-1], str2) + 1,\n",
    "        Levenshtein_Distance_Recursive(str1[:-1], str2[:-1]) + d)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 自底向上，动态规划\n",
    "def Levenshtein_Distance(str1, str2):\n",
    "    \"\"\"\n",
    "    计算字符串 str1 和 str2 的编辑距离\n",
    "    :param str1\n",
    "    :param str2\n",
    "    :return:\n",
    "    \"\"\"\n",
    "    matrix = [[i + j for j in range(len(str2) + 1)]\n",
    "              for i in range(len(str1) + 1)]\n",
    "\n",
    "    for i in range(1, len(str1) + 1):\n",
    "        for j in range(1, len(str2) + 1):\n",
    "            if (str1[i - 1] == str2[j - 1]):\n",
    "                d = 0\n",
    "            else:\n",
    "                d = 1\n",
    "\n",
    "            matrix[i][j] = min(matrix[i - 1][j] + 1, matrix[i][j - 1] + 1,\n",
    "                               matrix[i - 1][j - 1] + d)\n",
    "\n",
    "    return matrix[-1][-1]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
